(click “Code” button to show/hide code)
Load libraries
LoadPackages <- function(packages) {
# Load or install packages if they aren't already loaded.
#
# Args:
# packages: a vector of package names
#
for (package in packages) {
if (!require(package, character.only=T, quietly=T)) {
if (!package %in% installed.packages()) install.packages(package)
library(package, character.only=T)
}
}
}
LoadPackages(c("rvest", "dplyr", "tibble", "stringr", "igraph", "networkD3",
"ggplot2"))
Urls that contain boxer’s record
urls = c('https://en.wikipedia.org/wiki/Anthony_Joshua',
'https://en.wikipedia.org/wiki/Deontay_Wilder',
'https://en.wikipedia.org/wiki/Tyson_Fury',
'https://en.wikipedia.org/wiki/Dillian_Whyte',
'https://en.wikipedia.org/wiki/Alexander_Povetkin',
'https://en.wikipedia.org/wiki/Jarrell_Miller',
'https://en.wikipedia.org/wiki/Kubrat_Pulev',
'https://en.wikipedia.org/wiki/Luis_Ortiz_(Cuban_boxer)',
'https://en.wikipedia.org/wiki/Dominic_Breazeale',
'https://en.wikipedia.org/wiki/Adam_Kownacki',
'https://en.wikipedia.org/wiki/Óscar_Rivas',
'https://en.wikipedia.org/wiki/Joseph_Parker_(boxer)',
'https://en.wikipedia.org/wiki/Agit_Kabayel',
'https://en.wikipedia.org/wiki/Filip_Hrgović',
'https://en.wikipedia.org/wiki/Sergei_Kuzmin_(boxer)',
"https://en.wikipedia.org/wiki/Joe_Joyce_(boxer)",
"https://en.wikipedia.org/wiki/Andy_Ruiz",
"https://en.wikipedia.org/wiki/Dereck_Chisora",
"https://en.wikipedia.org/wiki/Nathan_Gorman_(boxer)",
"https://en.wikipedia.org/wiki/Evgenyi_Romanov",
"https://en.wikipedia.org/wiki/Christian_Hammer",
"https://en.wikipedia.org/wiki/Robert_Helenius",
"https://en.wikipedia.org/wiki/Efe_Ajagba",
"https://en.wikipedia.org/wiki/Bryant_Jennings",
"https://en.wikipedia.org/wiki/Charles_Martin_(boxer)",
"https://en.wikipedia.org/wiki/Lucas_Browne",
"https://en.wikipedia.org/wiki/Marco_Huck",
"https://en.wikipedia.org/wiki/Hughie_Fury",
"https://en.wikipedia.org/wiki/Carlos_Takam",
"https://en.wikipedia.org/wiki/Tomasz_Adamek",
"https://en.wikipedia.org/wiki/Junior_Fa",
"https://en.wikipedia.org/wiki/Kyotaro_Fujimoto",
"https://en.wikipedia.org/wiki/Johann_Duhaupas",
"https://en.wikipedia.org/wiki/Tony_Yoka",
"https://en.wikipedia.org/wiki/Artur_Szpilka",
"https://en.wikipedia.org/wiki/Zhang_Zhilei",
"https://en.wikipedia.org/wiki/Alexander_Dimitrenko",
"https://en.wikipedia.org/wiki/Simon_Kean",
"https://en.wikipedia.org/wiki/Mariusz_Wach",
"https://en.wikipedia.org/wiki/Trevor_Bryan",
"https://en.wikipedia.org/wiki/Éric_Molina",
"https://en.wikipedia.org/wiki/Bogdan_Dinu",
"https://en.wikipedia.org/wiki/Izu_Ugonoh",
"https://en.wikipedia.org/wiki/Arnold_Gjergjaj",
"https://en.wikipedia.org/wiki/Tyrone_Spong",
"https://en.wikipedia.org/wiki/Franz_Rill",
"https://en.wikipedia.org/wiki/George_Arias_(boxer)",
"https://en.wikipedia.org/wiki/Erkan_Teper",
"https://en.wikipedia.org/wiki/Gerald_Washington_(boxer)",
"https://en.wikipedia.org/wiki/Demsey_McKean")
Extract boxer’s name from urls
boxer_names <- str_replace_all(urls, '.+wiki\\/', '') %>%
str_replace_all('_', ' ') %>% str_replace_all(' \\(.+', '')
Extract boxer’s win/loss record
# col names that indentifies a win/loss record table
req_names = c("Result", "Record", "Opponent")
record_lst = list()
# for each boxer extract win/loss record form wikipedia url
for (i in 1:length(urls)){
# read tables on webpage
tbl <- read_html(urls[i]) %>%
html_nodes(css = 'table')
# for each table check if it's the win/loss record
for (j in 1:length(tbl)){
# convert html table to dataframe
record_temp = tbl[j] %>% html_table(fill = TRUE) %>% as.data.frame()
# check if the names in table look like a table with win/loss record
if(length(intersect(req_names, record_temp %>% names())) == 3) {
# save the win/loss record to a list of records
record_lst[[i]] <- record_temp %>%
# create col for boxer's name
mutate(boxer = boxer_names[i])
# break out of loop because record was found
break
}
# vector of values for fist row
row1 <- record_temp %>% slice(1) %>% unlist(.,use.names = F)
# if the first row looks like col names of record table
if(length(intersect(req_names, row1)) == 3){
# use the first row values as col names
names(record_temp) <- row1
# remove first first row
record_temp <- record_temp[-1,]
# save the win/loss record to a list of records
record_lst[[i]] <- record_temp %>%
# create a col for boxer's name
mutate(boxer = boxer_names[i],
`No.` = as.integer(`No.`))
# break out of loop because record was found
break
}
}
}
Filter records for wins only
# remove null entries from list
# some boxers may not have win/loss records on their wikipeida page
record_lst <- record_lst[!sapply(record_lst, is.null)]
# saveRDS(record_lst, "../data/record_lst.rds")
# record_lst <- readRDS("../data/record_lst.rds")
record_df <- bind_rows(record_lst) %>%
# only wins
filter(Result %in% c("Win")) #%>%
# filter for only wins against other top boxers
#filter(Opponent %in% boxer_names) # might re-consider this
# includeing draws makes Fury and Wilder completely dominate the network
# based on PR scores
# boxer's records that weren't available
# setdiff(boxer_names, unique(record_df$boxer))
Create network graph object
relations <- record_df %>%
# centrality measure gives importance to nodes with more edges directed toward
# a node. Therefore, relationships is defined as opponent gives a loss to boxer
mutate(to = boxer, from = Opponent) %>%
select(from, to, Result, Type) %>%
# create weights for different win outcomes
# weights don't appear to affect page rank much
mutate(weight =
case_when(#Result == "Draw" ~ .5,
Result == "Win" & Type %in% c("DQ", "TD") ~ 1,
Result == "Win" & Type %in% c("SD", "DEC") ~ 1.25,
Result == "Win" & Type %in% c("PTS", "MD") ~ 1.5,
Result == "Win" & Type %in% c("UD") ~ 1.75,
Result == "Win" & Type %in% c("TKO", "KO", "RTD") ~ 2)) %>%
# sum the weights and remove dupes
group_by(from, to) %>%
mutate(weight = sum(weight), rn = row_number()) %>%
ungroup() %>%
filter(rn == 1) %>%
select(from, to, weight)
# names of boxers in sample
boxers_df = data.frame(name = union(relations$to, relations$from))
# create igraph object
boxers_g <- graph_from_data_frame(relations, directed = T, vertices = boxers_df)
# save igraph object. to be plotted using gephi
# write_graph(boxers_g, file = '../data/boxers_g.gml', format = 'gml')
Ranking
# page rank centrality
# PageRank theory suggests that an imaginary flow of losses that is randomly
# going from one boxer to another will eventually stop. The damping factor is
# the probability that the flow will continue at any step. Setting the damping
# factor to 1 implies the flow of random losses will continue and when its
# applied to the network it will end on a boxer with no losses making that
# boxer's importance higher. Boxer's with no losses "absorb" more of the
# importance from boxer's that they beat. (AJ has higher
# page rank value when damping is set to 1. When damping = 0 then they all have
# the same value)
# boxers aren't directly penalized for losses but the PR values sum up to 1 so
# when another boxer's PR value increases from wins other boxers value goes down
pr <- page_rank(boxers_g, algo = "prpack", vids = V(boxers_g),
directed = TRUE, damping = .9999) # damping = 1 wont converge
# create a dataframe of boxer's ranking and sort
ranking <- pr$vector %>%
as.data.frame() %>%
rownames_to_column("Boxer") %>%
rename(Page_Rank = 2) %>%
arrange(desc(Page_Rank)) %>%
mutate(RN = row_number()) %>%
select(RN, Boxer, Page_Rank) %>%
left_join(relations %>%
group_by(to) %>%
summarize(Wins = n()), by = c("Boxer" = "to"))
# saveRDS(ranking, "../data/ranking.rds")
ranking
D3 network graph
V(boxers_g)$div=c("Boxers")
# delete any vertices that have no edges attached.
boxers_g <- delete.vertices(boxers_g,degree(boxers_g)==0)
# create ids
nodes <- data.frame(vertex_attr(boxers_g))
nodes$ID <- as.numeric(nodes$name)-1
# data frame with edge list
edges <- data.frame(get.edgelist(boxers_g))
colnames(edges) <- c("source","target")
edges <- merge(edges, nodes[,c("name","ID")],by.x="source",by.y="name")
edges <- merge(edges, nodes[,c("name","ID")],by.x="target",by.y="name")
edges <- edges[,3:4]
colnames(edges)=c("source","target")
j <- forceNetwork(Links=edges, Nodes=nodes, Source = "source",
Target = "target", NodeID="name", Group="div",
fontSize=12, opacity = 0.8, zoom=T, legend=T)
j
# saveNetwork(j, file = '../data/boxers_d3.html')